Data exploration of temporal and spatial patterns in San Francisco crime data, followed by applying and comparing two classification algorithms (K-nearest neighbors and Random Forest) for predicting crime type.
Import libraries
import pandas as pd
import numpy as np
import random
# libraries for viz
import folium
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # Set seaborn as default
# Single colour used for the bars/lines drawn by the plotting helpers
color = "firebrick"
# machine learning models from sklearn lib
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
No collection needed.
# Crime records (semicolon-separated CSV)
df = pd.read_csv('data/sf_data.csv', sep=';')
# District data, keyed by the same 'id' column
dis = pd.read_csv('data/sf_districts.csv', sep=';')
# Left join on 'id': every crime record is kept, rows without a district match get NaN
df = df.merge(dis, on='id', how='left')
df.head(3)
| id | category | description | weekday | date | time | resolution | longitude | latitude | label | district | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5069701104134 | assault | battery | wednesday | 06/22/2005 | 12:20 | none | -122.428223 | 37.781896 | violent | NaN |
| 1 | 6074729204104 | assault | assault | saturday | 07/15/2006 | 00:55 | none | -122.410672 | 37.799789 | violent | NaN |
| 2 | 7103536315201 | assault | stalking | tuesday | 09/25/2007 | 00:01 | none | -122.458226 | 37.741362 | violent | NaN |
# Report the dimensions of the merged frame
n_rows, n_cols = df.shape
print('Rows: ', n_rows)
print('Columns: ', n_cols)
Rows: 2129216 Columns: 11
Check for missing values and replace those in district with 'unknown'. Convert the date column to datetime type.
# Snapshot NaN counts and dtypes per column before any cleaning
d = {'NaN values': df.isna().sum(), 'Data types': df.dtypes}
check = pd.DataFrame(data=d)
# Replace missing districts with the placeholder 'unknown'
df['district'] = df['district'].fillna('unknown')
check['NaN values (2.0)'] = df.isna().sum()
# Dates are stored month-first (e.g. 06/22/2005). Passing the format explicitly
# removes any day/month ambiguity and avoids per-value format inference.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
check['New data types'] = df.dtypes
print('Time range:', df['date'].min(), '-', df['date'].max())
# Side-by-side before/after overview of NaN counts and dtypes
display(check)
Time range: 2003-01-01 00:00:00 - 2018-05-15 00:00:00
| NaN values | Data types | NaN values (2.0) | New data types | |
|---|---|---|---|---|
| id | 0 | int64 | 0 | int64 |
| category | 0 | object | 0 | object |
| description | 0 | object | 0 | object |
| weekday | 0 | object | 0 | object |
| date | 0 | object | 0 | datetime64[ns] |
| time | 0 | object | 0 | object |
| resolution | 0 | object | 0 | object |
| longitude | 0 | float64 | 0 | float64 |
| latitude | 0 | float64 | 0 | float64 |
| label | 0 | object | 0 | object |
| district | 1536362 | object | 0 | object |
Adding time variables
# Calendar parts taken from the parsed date column
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
# Hour of day, rounded to the nearest full hour (00:55 -> 1; times late in
# hour 23 wrap around to 0). 'h' replaces the deprecated 'H' frequency alias.
df['hour'] = pd.to_datetime(df['time']).dt.round('h').dt.hour
# Weekday name -> number, monday = 0 ... sunday = 6
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
weekdaysIndex = dict(zip(weekdays, range(len(weekdays))))
# Series.map is a direct per-value lookup and always yields a numeric column,
# unlike DataFrame.replace with a nested dict (slow on large frames).
# NOTE: a weekday string missing from the mapping becomes NaN instead of
# being left as text.
df['weekday_num'] = df['weekday'].map(weekdaysIndex)
display(df.head(3))
| id | category | description | weekday | date | time | resolution | longitude | latitude | label | district | year | month | hour | weekday_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5069701104134 | assault | battery | wednesday | 2005-06-22 | 12:20 | none | -122.428223 | 37.781896 | violent | unknown | 2005 | 6 | 12 | 2 |
| 1 | 6074729204104 | assault | assault | saturday | 2006-07-15 | 00:55 | none | -122.410672 | 37.799789 | violent | unknown | 2006 | 7 | 1 | 5 |
| 2 | 7103536315201 | assault | stalking | tuesday | 2007-09-25 | 00:01 | none | -122.458226 | 37.741362 | violent | unknown | 2007 | 9 | 0 | 1 |
Exploring overall counts as well as temporal and spatial trends for the crimes
Define function to create barchart
def dist_overall(var, hz, size):
    """Show a bar chart with the number of crimes per value of ``var``.

    Reads the module-level ``df`` (crime records) and ``color`` (bar colour).

    Args:
        var: Name of the column in ``df`` to group by. For ``'year'`` the bars
            are ordered chronologically and the last year is left out;
            any other column is ordered by ascending count.
        hz: If truthy, draw horizontal bars; otherwise vertical bars.
        size: Figure size, passed to ``plt.figure`` as ``figsize``.
    """
    # Count only the 'id' column instead of every column of the frame
    counts = df.groupby(var)['id'].count()
    if var == 'year':
        # Chronological order; the final year in the data is dropped
        counts = counts.sort_index().iloc[:-1]
    else:
        counts = counts.sort_values(ascending=True)

    plt.figure(figsize=size)
    if hz:
        plt.barh(counts.index, counts, color=color)
        plt.xlabel('Count')
    else:
        # Labels are cast to str so numeric groups are drawn as categories
        plt.bar([str(i) for i in counts.index], counts, color=color)
        plt.ylabel('Count')
    plt.title(f'Count of crimes for each {var}', fontsize=16)
    plt.xticks(rotation=0, fontsize=13)
    plt.yticks(fontsize=13)
    plt.show()
Make plots
# One overview chart per grouping: (column, horizontal bars?, figure size)
overview_specs = [
    ('category', True, (10, 12)),
    ('district', False, (14, 5)),
    ('year', False, (14, 5)),
    ('label', False, (14, 5)),
]
for group_col, horizontal, fig_size in overview_specs:
    dist_overall(group_col, horizontal, fig_size)
Choose focus crimes
For simplicity, we narrow the analysis down to six crime types:
# Number of records per category, least frequent first
df_cat = df.groupby('category').count().sort_values(by='id')
# Candidates: categories with more than 9000 records, minus six excluded by hand
excluded = [
    'other offenses',
    'non-criminal',
    'secondary codes',
    'larceny/theft',
    'suspicious occ',
    'stolen property',
]
focus = df_cat[df_cat['id'] > 9000].index.drop(excluded)
print(list(focus))
# The final selection is hard-coded and replaces the candidates printed above
focus = ['drug/narcotic', 'vandalism', 'fraud', 'trespass', 'prostitution', 'drunkenness']
# Keep only the records of the focus crimes
df_foc = df[df['category'].isin(focus)].reset_index(drop=True)
['drunkenness', 'disorderly conduct', 'prostitution', 'trespass', 'weapon laws', 'forgery/counterfeiting', 'fraud', 'missing person', 'robbery', 'burglary', 'warrants', 'vandalism', 'drug/narcotic', 'vehicle theft', 'assault']
Explore the temporal trends for different crime categories
Define function to create subplots
def dist_plot(time_var):
    """Draw one subplot per focus crime with record counts grouped by ``time_var``.

    Reads the module-level ``focus`` (crime categories), ``df_foc`` (records
    of those categories) and ``color``. Subplots are placed on a fixed 5x3
    grid, so at most 15 categories can be shown.

    Args:
        time_var: Name of the column in ``df_foc`` to group by. ``'year'`` is
            drawn as a line with the final year of the data left out; any
            other column is drawn as a bar chart. ``'weekday_num'``,
            ``'month'`` and ``'hour'`` additionally get readable tick labels.
    """
    fig = plt.figure(figsize=(17, 16))
    if time_var == 'year':
        # Final year of the data set. It is excluded by value rather than by
        # dropping each crime's last point, which would remove a complete
        # year for a crime that has no records in the final year.
        last_year = df_foc['year'].max()

    for i, crime in enumerate(focus):
        crime_rows = df_foc[df_foc['category'] == crime]
        counts = crime_rows.groupby(time_var)['id'].count().sort_index()
        fig.add_subplot(5, 3, i + 1)  # becomes the current axes for the plt.* calls below
        if time_var == 'year':
            counts = counts[counts.index < last_year]
            plt.plot(counts.index, counts, color=color)
        else:
            plt.bar(counts.index, counts, color=color)
        plt.title(crime, fontsize=14)
        plt.ylabel('Count')
        if time_var == 'weekday_num':
            plt.xticks(np.arange(7), ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun'], fontsize=12)
        elif time_var == 'month':
            plt.xticks(
                np.arange(1, 13),
                ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
                fontsize=12,
            )
        elif time_var == 'hour':
            plt.xticks(np.arange(24), [str(h).zfill(2) for h in np.arange(24)], fontsize=12)
    plt.tight_layout()
Yearly development
The year 2018 is excluded because the data covers only part of that year.
# Counts per year for each focus crime
dist_plot(time_var='year')
Monthly trends
# Counts per calendar month for each focus crime
dist_plot(time_var='month')
Weekly trends
# Counts per weekday (0 = Monday ... 6 = Sunday) for each focus crime
dist_plot(time_var='weekday_num')
24 hour cycle trends
# Counts per hour of day for each focus crime
dist_plot(time_var='hour')
Explore trends for location of crimes
Note: the data contains only 3 districts; ideally the coordinates would be mapped to all districts of SF.
Distribution of crimes across districts
# The same helper also groups by a non-time column: counts per district
dist_plot(time_var='district')
Map locations of chosen crimes
Only two crime types are mapped, restricted to the year 2015.
# Coordinates of San Francisco as [latitude, longitude]
SF_loc = [37.7792808, -122.4192363]
# One marker colour per mapped crime type
colors = ['midnightblue', 'firebrick']
# Create the baseline map
m = folium.Map(location=SF_loc, zoom_start=11.5, tiles='Stamen Toner')
# Map the first two focus crimes (in reverse order), each in its own colour
for crime, col in zip(focus[:2][::-1], colors):
    # Records of this crime type from 2015
    df_foc1 = df_foc[(df_foc.category == crime) & (df_foc.year == 2015)]
    # Iterate the coordinate columns directly instead of doing a positional
    # .iloc lookup (which builds a full row Series) twice per record
    for lat, lon in zip(df_foc1['latitude'], df_foc1['longitude']):
        folium.Circle(
            location=[lat, lon],
            radius=10,
            fill=True,
            color=col,
            fill_opacity=0.2,
            tooltip=crime,
        ).add_to(m)
# Last expression of the cell: renders the map in the notebook
m